# Load the student-performance CSV into a DataFrame and preview the first rows
import pandas as pd

csv_path = "StudentsPerformance.csv"
data = pd.read_csv(csv_path)
data.head()


# Structural overview of the dataset: row count, column labels, column dtypes
n_rows = len(data)
column_names = list(data.columns)
print(n_rows)
print(column_names)
print(data.dtypes)

1000
['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']
gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object


def letter_grade(score):
    """Translate a numeric score into a letter grade.

    Scores of at least 90 map to 'A', at least 80 to 'B', at least 70 to 'C',
    at least 60 to 'D'; every other score maps to 'F'.
    """
    # Cutoffs are checked from highest to lowest; the first one met wins.
    cutoffs = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    # No cutoff was reached.
    return 'F'
# Derive a letter-grade column for each subject and print its distribution
score_to_grade = (
    ('math score', 'Math Grade'),
    ('reading score', 'Reading Grade'),
    ('writing score', 'Writing Grade'),
)
for score_col, grade_col in score_to_grade:
    data[grade_col] = data[score_col].map(letter_grade)
    print(data[grade_col].value_counts())

F    323
D    268
C    216
B    135
A     58
Name: Math Grade, dtype: int64
C    264
F    254
D    233
B    170
A     79
Name: Reading Grade, dtype: int64
F    281
C    254
D    230
B    157
A     78
Name: Writing Grade, dtype: int64


# Count missing values in each of the original (non-derived) columns
checked_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
    "math score",
    "reading score",
    "writing score",
]
data[checked_columns].isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


# Number of distinct values in each categorical column
categorical_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]
data[categorical_columns].nunique()

gender                         2
race/ethnicity                 5
parental level of education    6
lunch                          2
test preparation course        2
dtype: int64


# Print the distinct values of each categorical column, in this order:
# gender, race/ethnicity, parental level of education, lunch,
# test preparation course
for column in (
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
):
    print(data[column].unique().tolist())

['female', 'male']
['group B', 'group C', 'group A', 'group D', 'group E']
["bachelor's degree", 'some college', "master's degree", "associate's degree", 'high school', 'some high school']
['standard', 'free/reduced']
['none', 'completed']


import matplotlib.pyplot as plt
import seaborn as sns


# Bar chart of the number of students per gender.
# The column is named through x=/data= instead of being passed positionally:
# seaborn 0.11 deprecated positional data vectors and newer releases accept
# only `data` positionally.
ax = sns.countplot(x="gender", data=data)


# Exact counts behind the chart
data['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64


# Bar chart of the number of students per race/ethnicity group.
# x= and data= are given as keywords because newer seaborn releases accept
# only `data` as a positional argument.
ax = sns.countplot(x="race/ethnicity", data=data)


# Exact counts behind the chart
data['race/ethnicity'].value_counts()

group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64


# Parental education level counts, split by race/ethnicity group.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (positional data vectors are rejected by newer
# seaborn); ax=ax draws on the figure created above explicitly.
sns.countplot(
    x="parental level of education",
    hue="race/ethnicity",
    data=data,
    ax=ax,
)

<matplotlib.axes._subplots.AxesSubplot at 0x20da9df6af0>


# Parental education level counts, split by gender.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (positional data vectors are rejected by newer
# seaborn); ax=ax draws on the figure created above explicitly.
sns.countplot(
    x="parental level of education",
    hue="gender",
    data=data,
    ax=ax,
)

<matplotlib.axes._subplots.AxesSubplot at 0x20da4524d90>


# Frequency of each parental education level
parent_education = data['parental level of education']
parent_education.value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64


# Lunch type counts split by race/ethnicity group.
# Columns are named through x=/hue=/data= because newer seaborn releases
# accept only `data` as a positional argument.
ax = sns.countplot(x="lunch", hue="race/ethnicity", data=data)


# Lunch type counts split by gender
ax = sns.countplot(x="lunch", hue="gender", data=data)


# Exact counts per lunch type
data['lunch'].value_counts()

standard        645
free/reduced    355
Name: lunch, dtype: int64


# Test-preparation status counts split by race/ethnicity group.
# Columns are named through x=/hue=/data= because newer seaborn releases
# accept only `data` as a positional argument.
ax = sns.countplot(x="test preparation course", hue="race/ethnicity", data=data)


# Test-preparation status counts split by gender
ax = sns.countplot(x="test preparation course", hue="gender", data=data)


# Exact counts per test-preparation status
data['test preparation course'].value_counts()

none         642
completed    358
Name: test preparation course, dtype: int64


# Summary table for the numeric columns: mean, quartiles and range
summary = data.describe()
# describe() has no range row, so derive it from max and min
summary.loc['range'] = summary.loc['max'] - summary.loc['min']

out_fields = ['mean', '25%', '50%', '75%', 'range']
# Keep only the rows of interest and label the 50% quantile as the median
stats_tr = summary.loc[out_fields].rename(index={'50%': 'median'})
stats_tr


# Histogram of math scores using 10 bins
math_scores = data['math score']
plt.hist(math_scores, bins=10)

(array([  2.,   2.,  10.,  26.,  95., 188., 268., 216., 135.,  58.]),
 array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.]),
 <a list of 10 Patch objects>)


# Histogram of reading scores using 10 bins
reading_scores = data['reading score']
plt.hist(reading_scores, bins=10)

(array([  4.,   7.,  22.,  64., 140., 182., 237., 168., 120.,  56.]),
 array([ 17. ,  25.3,  33.6,  41.9,  50.2,  58.5,  66.8,  75.1,  83.4,
         91.7, 100. ]),
 <a list of 10 Patch objects>)


# Histogram of writing scores using 10 bins
writing_scores = data['writing score']
plt.hist(writing_scores, bins=10)

(array([  2.,   6.,  14.,  55., 126., 161., 223., 225., 120.,  68.]),
 array([ 10.,  19.,  28.,  37.,  46.,  55.,  64.,  73.,  82.,  91., 100.]),
 <a list of 10 Patch objects>)


# Math letter grades split by gender.
# Fixes relative to the earlier version of these three cells:
#  * columns are named via x=/hue=/data= (newer seaborn accepts only `data`
#    positionally);
#  * the plot is drawn on the Axes created by plt.subplots (ax=ax);
#  * `ax` keeps referring to the Axes rather than being rebound to the Text
#    object returned by set_title().
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
sns.countplot(x="Math Grade", hue="gender", data=data,
              order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Math Grades and Gender')


# Reading letter grades split by gender
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
sns.countplot(x="Reading Grade", hue="gender", data=data,
              order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Reading Grades and Gender')


# Writing letter grades split by gender
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
sns.countplot(x="Writing Grade", hue="gender", data=data,
              order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Writing Grades and Gender')


# Math letter grades split by race/ethnicity group.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Math Grade", hue="race/ethnicity", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Math Grades and race/ethnicity')

Text(0.5, 1.0, 'Math Grades and race/ethnicity')


# Reading letter grades split by race/ethnicity group.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Reading Grade", hue="race/ethnicity", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Reading Grades and race/ethnicity')

Text(0.5, 1.0, 'Reading Grades and race/ethnicity')


# Writing letter grades split by race/ethnicity group.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Writing Grade", hue="race/ethnicity", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Writing Grades and race/ethnicity')

Text(0.5, 1.0, 'Writing Grades and race/ethnicity')


# Math letter grades split by parental level of education.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Math Grade", hue="parental level of education",
                   data=data, order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Math Grades and parental level of education')

Text(0.5, 1.0, 'Math Grades and parental level of education')


# Reading letter grades split by parental level of education.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Reading Grade", hue="parental level of education",
                   data=data, order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Reading Grades and parental level of education')

Text(0.5, 1.0, 'Reading Grades and parental level of education')


# Writing letter grades split by parental level of education.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Writing Grade", hue="parental level of education",
                   data=data, order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Writing Grades and parental level of education')

Text(0.5, 1.0, 'Writing Grades and parental level of education')


# Math letter grades split by lunch type.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Math Grade", hue="lunch", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Math Grades and Lunch')

Text(0.5, 1.0, 'Math Grades and Lunch')


# Reading letter grades split by lunch type.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Reading Grade", hue="lunch", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Reading Grades and Lunch')

Text(0.5, 1.0, 'Reading Grades and Lunch')


# Writing letter grades split by lunch type.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Writing Grade", hue="lunch", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Writing Grades and Lunch')

Text(0.5, 1.0, 'Writing Grades and Lunch')


# Math letter grades split by test-preparation status.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Math Grade", hue="test preparation course", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Math Grades and Test preparation course')

Text(0.5, 1.0, 'Math Grades and Test preparation course')


# Reading letter grades split by test-preparation status.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Reading Grade", hue="test preparation course", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Reading Grades and Test preparation course')

Text(0.5, 1.0, 'Reading Grades and Test preparation course')


# Writing letter grades split by test-preparation status.
dim = (15, 10)
fig, ax = plt.subplots(figsize=dim)
# Keyword x=/hue=/data= (newer seaborn accepts only `data` positionally);
# ax=ax draws on the Axes created above.
ax = sns.countplot(x="Writing Grade", hue="test preparation course", data=data,
                   order=['A', 'B', 'C', 'D', 'F'], ax=ax)
ax.set_title('Writing Grades and Test preparation course')

Text(0.5, 1.0, 'Writing Grades and Test preparation course')


# Split the students into the two gender groups
male = data.loc[data['gender'].eq('male')]
female = data.loc[data['gender'].eq('female')]


import numpy as np
from scipy import stats

# Compare math scores of the two groups on 100 scores drawn from each.
# np.random.choice samples with replacement by default; the fixed seed makes
# the draws reproducible.
np.random.seed(12)
male_math = male['math score']
female_math = female['math score']
sample_male = np.random.choice(male_math, size=100)
sample_female = np.random.choice(female_math, size=100)
# equal_var=False: the two samples are not assumed to share a variance
stats.ttest_ind(sample_male, sample_female, equal_var=False)

Ttest_indResult(statistic=3.787604918380427, pvalue=0.00020398980693459655)


# Split the students by test-preparation status
prep_status = data['test preparation course']
completed = data.loc[prep_status.eq('completed')]
none = data.loc[prep_status.eq('none')]


# Compare math scores of the two groups on 100 scores drawn from each
# (np.random.choice samples with replacement by default; seed fixed for
# reproducibility).
np.random.seed(100)
completed_math = completed['math score']
none_math = none['math score']
sample_completed = np.random.choice(completed_math, size=100)
sample_none = np.random.choice(none_math, size=100)
# equal_var=False: the two samples are not assumed to share a variance
stats.ttest_ind(sample_completed, sample_none, equal_var=False)

Ttest_indResult(statistic=2.7138174802757202, pvalue=0.007237933982593933)


# Split the students by lunch type.
# The free/reduced group gets its own name: it was previously stored in
# `none`, which overwrote the test-preparation "none" group and mislabelled
# the data it held.
standard = data[data['lunch'] == 'standard']
free_reduced = data[data['lunch'] == 'free/reduced']


# Compare math scores of the two lunch groups on 100 scores drawn from each
# (np.random.choice samples with replacement by default; seed fixed for
# reproducibility).
np.random.seed(150)
sample_standard = np.random.choice(standard['math score'], size=100)
sample_free_reduced = np.random.choice(free_reduced['math score'], size=100)
# equal_var=False: the two samples are not assumed to share a variance
stats.ttest_ind(sample_standard, sample_free_reduced, equal_var=False)

Ttest_indResult(statistic=5.475441703975005, pvalue=1.333670364723057e-07)

	math score	reading score	writing score
mean	66.089	69.169	68.054
25%	57.000	59.000	57.750
median	66.000	70.000	69.000
75%	77.000	79.000	79.000
range	100.000	83.000	90.000

Student Performance Analysis¶

Section 1: Description¶

Section 2: Initial Plan for EDA¶

Section 3: Feature Engineering¶

Checking for null values¶

Section 4: Key Findings and Insight¶

Find the number of unique values per feature¶

Unique values per categorical column¶

Data Visualization¶

Gender¶

Race/ethnicity¶

Parental Level of Education¶

Lunch¶

Test Preparation Course¶

Determining the mean, median, quartiles, and range for each test score¶

Math Scores¶

Reading Scores¶

Writing Scores¶

Letter Grades vs Gender¶

Letter Grades and Race/ethnicity¶

Letter Grades and Parental level of Education¶

Lunch vs Test Scores¶

Test Preparation Course vs Test Scores¶

Section 5: Formulating Hypotheses¶

Hypothesis 1:¶

Hypothesis 2:¶

Hypothesis 3:¶

Section 6: Conducting Test for Significance¶

Hypothesis 1:¶

Hypothesis 2:¶

Hypothesis 3:¶

Section 7: Future Analysis¶

Section 8: Summary¶

	gender	race/ethnicity	parental level of education	lunch	test preparation course	math score	reading score	writing score
0	female	group B	bachelor's degree	standard	none	72	72	74
1	female	group C	some college	standard	completed	69	90	88
2	female	group B	master's degree	standard	none	90	95	93
3	male	group A	associate's degree	free/reduced	none	47	57	44
4	male	group C	some college	standard	none	76	78	75